import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
### so that u dont have warnings
from warnings import filterwarnings
filterwarnings('ignore')
# ---- Read the dataset and take a first look ----
# Raw string so the backslashes in the Windows path are not treated as
# escape sequences ('\p', '\Z' are invalid escapes, deprecated in Python 3).
df = pd.read_csv(r'D:\python project\Zomato Data Analysis/zomato.csv')
df.head()
df.shape
df.info()
df.isnull().sum()
# ---- Analyse the 'rate' column ----
# Drop rows (axis='index') where 'rate' is missing; subset restricts the
# NaN check to that single column.
df.dropna(axis='index', subset=['rate'], inplace=True)
df['rate'].unique()
def split(x):
    """Return the part of a rating string before the '/' (e.g. '4.1/5' -> '4.1').

    The result is stripped of surrounding whitespace so values such as
    '3.8 /5' do not leave a trailing-space duplicate ('3.8 ' vs '3.8')
    in the cleaned column.
    """
    return x.split('/')[0].strip()
df['rate'] = df['rate'].apply(split)  # apply the cleaner to every row
df['rate'].unique()
# NOTE(review): replace() without a column selector rewrites 'NEW'/'-'
# in EVERY column, not just 'rate' — confirm that is intended.
df.replace('NEW', 0, inplace=True)  # unrated new restaurants -> 0
df.replace('-', 0, inplace=True)    # missing-rating marker -> 0
df['rate'] = df['rate'].astype(float)  # numeric dtype for aggregation
# ---- Average rating of each restaurant ----
df.groupby('name')['rate'].mean()
df.groupby('name')['rate'].mean().to_frame()
df_rate = df.groupby('name')['rate'].mean().to_frame().reset_index()
df_rate
# Rename the columns.
df_rate.columns = ['restaurant', 'avg_rating']
df_rate.head(20)
sns.set_style(style='whitegrid')  # white-grid background
# NOTE(review): distplot is deprecated (removed in seaborn >= 0.14);
# sns.histplot(df_rate['avg_rating'], kde=True) is the modern equivalent.
sns.distplot(df_rate['avg_rating'])
# Observation: more than half of the restaurants are rated between 3 and 4;
# ratings above 4.5 are very rare.
# ---- Most common restaurant names (largest chains) ----
chains = df['name'].value_counts()[0:20]
chains
plt.figure(figsize=(10, 7))
chains = df['name'].value_counts()[0:20]
sns.barplot(x=chains, y=chains.index, palette='deep')  # count vs. restaurant name
plt.title("Most famous restaurants chains in Bangaluru")
plt.xlabel("Number of outlets")
# ---- Online-order acceptance ----
x = df['online_order'].value_counts()
x
# !pip install plotly  # IPython magic: run in a notebook cell, not plain Python
import plotly.express as px
x = df['online_order'].value_counts()
# NOTE(review): labels assume value_counts() order is [Yes, No] — verify.
labels = ['accepted', 'not accepted']
fig = px.pie(df, values=x, names=labels, title='Pie chart')
fig.show()
# Another way to draw the same pie with matplotlib.
x = df['online_order'].value_counts()
labels = ['accepted', 'not accepted']
plt.pie(x, explode=[0.0, 0.1], autopct='%1.1f%%')  # keep one decimal place
# ---- Table-booking availability ----
x=df['book_table'].value_counts()
x
import plotly.graph_objs as go
from plotly.offline import iplot
x=df['book_table'].value_counts()
# NOTE(review): labels assume value_counts() order is [No, Yes] — verify.
labels=['not book','book']
# Interactive pie: hover shows label+percent, slice text shows raw counts.
# pull has 5 entries for 2 slices; the extras are ignored by plotly.
trace=go.Pie(labels=labels, values=x,
hoverinfo='label+percent', textinfo='value',
textfont=dict(size=25),
pull=[0, 0, 0,0.2, 0]
)
iplot([trace])
#another way to draw, with matplotlib
x=df['book_table'].value_counts()
labels=['not book','book']
plt.pie(x,explode=[0.0,0.1],autopct='%1.1f%%')
# ---- Restaurant types ----
df['rest_type'].isna().sum()
# BUG FIX: the original called dropna(inplace=True) on the Series
# df['rest_type'], which modifies a temporary copy and leaves df unchanged.
# Drop the rows from df itself instead.
df.dropna(axis='index', subset=['rest_type'], inplace=True)
len(df['rest_type'].unique())  # number of distinct restaurant types
# Share of each restaurant type as a percentage of all rows.
(df['rest_type'].value_counts()/len(df))*100
plt.figure(figsize=(20, 12))
df['rest_type'].value_counts().nlargest(20).plot.bar(color='red')
plt.gcf().autofmt_xdate()  # tilt the x labels so they stay readable
# Another way to draw: interactive plotly bar chart.
trace1 = go.Bar(
    x = df['rest_type'].value_counts().nlargest(20).index,
    y = df['rest_type'].value_counts().nlargest(20),
    name= 'rest_type')
iplot([trace1])
# ---- Restaurants with the most votes ----
df.groupby('name')['votes'].sum().nlargest(10).plot.bar()
trace1 = go.Bar(
    x = df.groupby('name')['votes'].sum().nlargest(10).index,
    y = df.groupby('name')['votes'].sum().nlargest(10))
iplot([trace1])
df.isnull().sum()
df.groupby('location')['name'].unique()
# Number of distinct restaurants per location.
# nunique() replaces the original manual loop over groupby; result is a
# frame indexed by location with one 'restaurant' count column.
df_total = df.groupby('location')['name'].nunique().to_frame('restaurant')
df_total.head()
df_total.sort_values(by='restaurant').tail(10)  # 10 locations with the most restaurants
df_total.sort_values(by='restaurant').tail(10).plot.bar()
df_total.sort_values(by='restaurant').tail(10).index
trace1 = go.Bar(
    x = df_total['restaurant'].nlargest(10).index,
    y = df_total['restaurant'].nlargest(10),
    name= 'Priority')
iplot([trace1])
# ---- Most popular cuisines ----
cuisines = df['cuisines'].value_counts()[:10]  # top-10 cuisine combinations
cuisines
# Keyword arguments: positional x/y were removed in seaborn 0.12,
# so the original positional call errors on current seaborn.
sns.barplot(x=cuisines, y=cuisines.index)
plt.xlabel('Count')
plt.title("Most popular cuisines of Bangalore")
# Another way to draw, with plotly.
cuisines = df['cuisines'].value_counts()[:10]
trace1 = go.Bar(
    x = cuisines.index,
    y = cuisines,
    name= 'Cuisines')
iplot([trace1])
# Observation: North Indian, Chinese, South Indian and Biriyani are the
# most common cuisines.
# ---- Approximate cost for two people ----
len(df['approx_cost(for two people)'].value_counts())
df['approx_cost(for two people)'].isna().sum()
# axis=0 drops rows; subset restricts the NaN check to this column only.
df.dropna(axis=0, subset=['approx_cost(for two people)'], inplace=True)
df['approx_cost(for two people)'].isna().sum()
df['approx_cost(for two people)'].unique()
# Strip thousands separators ('1,200' -> '1200') so the column can be numeric.
df['approx_cost(for two people)']=df['approx_cost(for two people)'].apply(lambda x:x.replace(',',''))
df['approx_cost(for two people)'].unique()
df['approx_cost(for two people)']=df['approx_cost(for two people)'].astype(int)
df['approx_cost(for two people)'].dtype
df['approx_cost(for two people)']
# NOTE(review): distplot is deprecated in recent seaborn; histplot is the modern form.
sns.distplot(df['approx_cost(for two people)'])
# Another way to draw.
px.histogram(df, x="approx_cost(for two people)")
# Observation: the typical two-person cost is around 500, and most lie
# under 1000 — most restaurants are affordable, very few are luxurious.
df['approx_cost(for two people)'].min()
df['approx_cost(for two people)'].max()
df[df['approx_cost(for two people)']==6000]
df[df['approx_cost(for two people)']==6000]['name']
plt.figure(figsize=(10,7))
sns.scatterplot(x="rate",y='approx_cost(for two people)',hue='online_order',data=df)
plt.show()
# Observation: higher-rated restaurants tend to cost more (for two people).
fig = px.box(df,x='online_order',y='approx_cost(for two people)')
fig.show()
df[df['approx_cost(for two people)']==6000].loc[:,('name','cuisines','dish_liked')]
# equivalently: df[df['approx_cost(for two people)']==6000][['name','cuisines','dish_liked']]
data = df.copy()
data.dtypes
data.set_index('name', inplace=True)
# The 10 most expensive restaurants (cost for two) and their prices.
data['approx_cost(for two people)'].nlargest(10).plot.bar()
# Another way to draw.
trace1 = go.Bar(
    x = data['approx_cost(for two people)'].nlargest(10).index,
    y = data['approx_cost(for two people)'].nlargest(10),
    name= 'Priority')
iplot([trace1])
# The 10 cheapest restaurants.
data['approx_cost(for two people)'].nsmallest(10).plot.bar()
trace1 = go.Bar(
    x = data['approx_cost(for two people)'].nsmallest(10).index,
    y = data['approx_cost(for two people)'].nsmallest(10),
    name= 'Priority')
iplot([trace1])
# Re-index by location to inspect budget restaurants per area.
data.set_index('location', inplace=True)
data['approx_cost(for two people)'].nsmallest(10)
data[data['approx_cost(for two people)']<=500]
df_budget = data[data['approx_cost(for two people)']<=500].loc[:, ('approx_cost(for two people)')]
df_budget = df_budget.reset_index()
df_budget.head()
df_budget['approx_cost(for two people)'].value_counts().plot.bar()
trace1 = go.Bar(
    x = df_budget['approx_cost(for two people)'].value_counts().index,
    y = df_budget['approx_cost(for two people)'].value_counts(),
    name= 'Priority')
iplot([trace1])
# Observation: costs around 300-400 are the most frequent.
# Affordable AND well-rated: rating >= 4 and cost for two <= 500.
df_new = df[(df['rate']>=4) & (df['approx_cost(for two people)']<=500)]
len(df_new['name'].unique())
# Distinct qualifying restaurants per location. nunique() replaces the
# original manual groupby loop and avoids re-using 'location_df' as both
# the loop variable and the result frame.
location_df = df_new.groupby('location')['name'].nunique().to_frame('restaurant')
location_df.head(20)
location_df['restaurant'].nlargest(10).plot.bar()
plt.gcf().autofmt_xdate()  # tilt x labels for readability
plt.ylabel('Total restaurants')
# Another way to draw.
trace1 = go.Bar(
    x = location_df['restaurant'].nlargest(10).index,
    y = location_df['restaurant'].nlargest(10),
    name= 'Priority')
iplot([trace1])
# Visualise WHICH restaurants qualify at each location.
# groupby(...).unique() replaces the original manual loop; the result is
# indexed by location with one array of restaurant names per row.
afford = df_new.groupby('location')['name'].unique().to_frame('res_names')
afford.head()
def return_budget(location, restaurant, max_cost=400, min_rate=4):
    """Return unique names of *restaurant*-type places in *location* that
    cost at most *max_cost* for two and are rated strictly above *min_rate*.

    The threshold defaults keep the original behaviour (cost <= 400,
    rate > 4); callers may now override them.
    """
    budget = df[(df['approx_cost(for two people)'] <= max_cost) &
                (df['location'] == location) &
                (df['rate'] > min_rate) &
                (df['rest_type'] == restaurant)]
    return budget['name'].unique()
return_budget('BTM', "Quick Bites")
# ---- Locations with the most restaurants overall ----
plt.figure(figsize=(10, 7))
Restaurant_locations = df['location'].value_counts()[:20]
# Keyword arguments: positional x/y were removed in seaborn 0.12.
sns.barplot(x=Restaurant_locations, y=Restaurant_locations.index)
# Another way to draw.
Restaurant_locations = df['location'].value_counts()[:20]
trace1 = go.Bar(
    x = Restaurant_locations.index,
    y = Restaurant_locations,
    name= 'Priority')
iplot([trace1])
# Observation: BTM, HSR and Koramangala 5th Block have the most restaurants.
# Next: fetch latitude/longitude for each location with Geopy so we can do
# geographical analysis.
# Build a lookup table of the unique location names; prefix with the city
# so the geocoder can resolve them inside Bangalore.
locations = pd.DataFrame({"Name": df['location'].unique()})
locations['new_Name'] = 'Bangalore ' + locations['Name']
locations.head()
# !pip install geopy  # IPython magic: run in a notebook cell, not plain Python
from geopy.geocoders import Nominatim
lat_lon = []
geolocator = Nominatim(user_agent="app")
# NOTE(review): this geocodes the bare 'Name', not the city-prefixed
# 'new_Name' built above — confirm which one is intended.
# NOTE(review): Nominatim rate-limits requests; consider
# geopy.extra.rate_limiter.RateLimiter for large batches.
for place in locations['Name']:
    geocoded = geolocator.geocode(place)
    if geocoded is None:
        lat_lon.append(np.nan)  # keep the list aligned with the rows
    else:
        lat_lon.append((geocoded.latitude, geocoded.longitude))
locations['geo_loc'] = lat_lon
locations.head()
locations.to_csv('zomato_locations.csv', index=False)
# We now have a (lat, lon) tuple for every location the geocoder resolved.
# This is used to plot maps
# Restaurant count per location as a two-column frame ('Name', 'count').
Rest_locations=pd.DataFrame(df['location'].value_counts().reset_index())
Rest_locations.columns=['Name','count']
Rest_locations.head()
# now combine both the dataframes
locations.head()
# Left-merge keeps every counted location; dropna() then removes the ones
# the geocoder could not resolve (geo_loc is NaN).
Restaurant_locations=Rest_locations.merge(locations,on='Name',how="left").dropna()
Restaurant_locations.head()
Restaurant_locations.shape
Restaurant_locations['count'].max()
# geo_loc holds (latitude, longitude) tuples.
type(Restaurant_locations['geo_loc'][0])
def generateBaseMap(default_location=None, default_zoom_start=12):
    """Return a folium Map centred on Bangalore by default.

    default_location defaults to [12.97, 77.59] via a None sentinel so the
    mutable list is not shared between calls (mutable-default pitfall).
    Relies on `folium` being imported before the first call; the script
    imports it further down, before this function is ever used.
    """
    if default_location is None:
        default_location = [12.97, 77.59]
    base_map = folium.Map(location=default_location, zoom_start=default_zoom_start)
    return base_map
len(Restaurant_locations['geo_loc'])
Restaurant_locations.isna().sum()
Restaurant_locations['geo_loc'][0][0]  # latitude of the first row
Restaurant_locations['geo_loc'][0][1]  # longitude of the first row
np.array(Restaurant_locations['geo_loc'])
# Unzip the (lat, lon) tuples into two separate columns.
lat, lon = zip(*np.array(Restaurant_locations['geo_loc']))
Restaurant_locations['lat'] = lat
Restaurant_locations['lon'] = lon
Restaurant_locations.head()
# !pip install folium  # IPython magic: run in a notebook cell, not plain Python
import folium
from folium.plugins import HeatMap
basemap = generateBaseMap()
basemap
Restaurant_locations[['lat','lon','count']].values.tolist()
# Heat map weighted by the restaurant count of each location.
HeatMap(Restaurant_locations[['lat','lon','count']].values.tolist(),zoom=20,radius=15).add_to(basemap)
basemap
# Observation: restaurants concentrate in central Bangalore and thin out
# toward the periphery — useful for choosing a location for a new venture.
df.head()
# ---- Where is North Indian cuisine concentrated? ----
df2 = df[df['cuisines']=='North Indian']
df2.head()
# Count rows per location ('url' is just a never-null column to count on).
north_india = df2.groupby(['location'],as_index=False)['url'].agg('count')
north_india.columns = ['Name','count']
north_india.head()
north_india = north_india.merge(locations,on="Name",how='left').dropna()
north_india.head()
north_india['lan'], north_india['lon'] = zip(*north_india['geo_loc'].values)
# BUG FIX: the original discarded the result of drop(); assign it back
# (mirrors the South-Indian section below).
north_india = north_india.drop(['geo_loc'],axis=1)
basemap = generateBaseMap()
HeatMap(north_india[['lan','lon','count']].values.tolist(),zoom=20,radius=15).add_to(basemap)
basemap
# ---- What about South Indian cuisine? ----
df3 = df[df['cuisines']=='South Indian']
# BUG FIX: the original grouped df2 (the North Indian subset) here,
# so the "South Indian" map actually showed North Indian data.
south_india = df3.groupby(['location'],as_index=False)['url'].agg('count')
south_india.columns = ['Name','count']
south_india = south_india.merge(locations,on="Name",how='left').dropna()
south_india['lan'], south_india['lon'] = zip(*south_india['geo_loc'].values)
south_india = south_india.drop(['geo_loc'],axis=1)
south_india.head()
basemap = generateBaseMap()
HeatMap(south_india[['lan','lon','count']].values.tolist(),zoom=20,radius=15).add_to(basemap)
basemap
# ---- Top-3 most common outlets within each restaurant type ----
df_1 = df.groupby(['rest_type','name']).agg('count')
datas = df_1.sort_values(['url'],ascending=False).groupby(['rest_type'],
    as_index=False).apply(lambda x : x.sort_values(by="url",ascending=False).head(3))['url'].reset_index().rename(columns={'url':'count'})
datas
# Step-by-step version of the same pipeline (without the head(3) limit),
# useful for inspecting each intermediate result.
df_1 = df.groupby(['rest_type','name']).agg('count')
df_1
df_1.sort_values(['url'],ascending=False)
df_1.sort_values(['url'],ascending=False).groupby(['rest_type'],as_index=False).apply(lambda x : x.sort_values(by="url",ascending=False))
df_1.sort_values(['url'],ascending=False).groupby(['rest_type'],as_index=False).apply(lambda x : x.sort_values(by="url",ascending=False))['url']
df_1.sort_values(['url'],ascending=False).groupby(['rest_type'],as_index=False).apply(lambda x : x.sort_values(by="url",ascending=False))['url'].reset_index()
dataset = df_1.sort_values(['url'],ascending=False).groupby(['rest_type'],
    as_index=False).apply(lambda x : x.sort_values(by="url",ascending=False))['url'].reset_index().rename(columns={'url':'count'})
dataset
casual = dataset[dataset['rest_type']=='Casual Dining']
casual
# Observation: Empire Restaurant, Beijing Bites and Mani's Dum Biriyani are
# the most popular casual-dining chains in Bangalore.
# !pip install wordcloud  # IPython magic: run in a notebook cell, not plain Python
from wordcloud import WordCloud
df.head()
# Split the comma-separated liked dishes into lists (empty list for NaN).
df['update_dish_liked']=df['dish_liked'].apply(lambda x : x.split(',') if type(x)==str else [''])
df.head()
df['rest_type'].value_counts()[:9].index
# ---- Create word clouds per restaurant type ----
from wordcloud import WordCloud, STOPWORDS
df.isna().sum()
df.dropna(axis='index',subset=['rest_type'],inplace=True)
df.dropna(axis='index',subset=['dish_liked'],inplace=True)  # axis='index' is the same as axis=0
df.isna().sum()
# Word cloud for one restaurant type: Quick Bites.
data = df[df['rest_type']=='Quick Bites']
data['dish_liked']
stopwords = set(STOPWORDS)  # standard English stop-word list
plt.figure(figsize=(10, 20))
# Collect every liked dish, lower-cased, then join once at the end
# (replaces the original index loop and quadratic string concatenation).
tokens = []
for entry in data['dish_liked']:
    tokens.extend(w.lower() for w in entry.split())
dishes = " ".join(tokens) + " "
wordcloud = WordCloud(max_font_size=None, background_color='white', collocations=False,stopwords = stopwords,width=1500, height=1500).generate(dishes)
plt.imshow(wordcloud)
plt.axis("off")
df.head()
# ---- Demonstrate the review-text cleaning pipeline on a single review ----
df['reviews_list'][0]
data=df['reviews_list'][0].lower()
data
import re
# Keep letters only; every other character becomes a space.
data2=re.sub('[^a-zA-Z]', ' ',data)
data2
data3=re.sub('rated', ' ',data2)  # drop the boilerplate word 'rated'
data3
# NOTE(review): this removes EVERY letter 'x' — presumably to clean up
# '\xNN' escape remnants, but it also hits 'x' inside real words.
data4=re.sub('x',' ',data3)  # use a space to replace 'x'
data4
re.sub(' +',' ',data4)  # collapse runs of spaces into one
# ---- Word cloud of reviews for a particular restaurant type ----
dataset = df[df['rest_type']=='Quick Bites']
type(dataset['reviews_list'][3])
# Clean each review and join once at the end; the original built the string
# with += in a loop, which is quadratic in the total text size.
cleaned = []
for review in dataset['reviews_list']:
    review = review.lower()
    review = re.sub('[^a-zA-Z]', ' ', review)  # letters only
    review = re.sub('rated', ' ', review)      # drop the boilerplate word
    review = re.sub('x', ' ', review)          # leftover \x escape markers
    review = re.sub(' +', ' ', review)         # collapse runs of spaces
    cleaned.append(review)
total_review = ' ' + ''.join(cleaned)
wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                stopwords = stopwords,
                min_font_size = 10).generate(total_review)
# plot the WordCloud image
plt.figure(figsize = (8, 8))
plt.imshow(wordcloud)
plt.axis("off")
def importance(restaurant):
    """Draw a word cloud of all review text for the given restaurant type.

    Each review is lower-cased, reduced to letters, stripped of the word
    'rated', of stray 'x' characters (\\x escape remnants) and of repeated
    spaces, then the combined text is rendered with WordCloud.
    """
    dataset = df[df['rest_type']==restaurant]
    cleaned = []
    for review in dataset['reviews_list']:
        review = review.lower()
        review = re.sub('[^a-zA-Z]', ' ', review)
        review = re.sub('rated', ' ', review)
        review = re.sub('x', ' ', review)
        review = re.sub(' +', ' ', review)
        cleaned.append(review)
    # Join once: linear instead of the original quadratic += loop.
    total_review = ' ' + ''.join(cleaned)
    wordcloud = WordCloud(width = 800, height = 800,
                    background_color ='white',
                    stopwords = set(STOPWORDS),
                    min_font_size = 10).generate(total_review)
    # plot the WordCloud image
    plt.figure(figsize = (8, 8))
    plt.imshow(wordcloud)
    plt.axis("off")
importance('Quick Bites')